Assignment 03

Author
Affiliation

Ryan Martin

Boston University

Published

September 24, 2025

Modified

September 24, 2025

from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "svg"
import re
import numpy as np
import plotly.graph_objects as go
from pyspark.sql.functions import col, split, explode, regexp_replace, transform, when
from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id

np.random.seed(42)

pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("./data/LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("./data/lightcast_job_postings.csv")

# Show Schema and Sample Data
#print("---This is Diagnostic check, No need to print it in the final doc---")

#df.printSchema() # comment this line when rendering the submission
#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/24 21:07:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/24 21:07:07 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
[Stage 0:===========================================================(1 + 0) / 1]                                                                                [Stage 1:>                                                          (0 + 1) / 1]                                                                                

1 1. Data Preperation

df = df.withColumn("SALARY", col("SALARY").cast("float")) \
      .withColumn("SALARY_FROM", col("SALARY_FROM").cast("float")) \
      .withColumn("SALARY_TO", col("SALARY_TO").cast("float")) \
      .withColumn("MIN_YEARS_EXPERIENCE", col("MIN_YEARS_EXPERIENCE").cast("float")) \
      .withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float")) \
      
def compute_median(sdf, col_name):
  q = sdf.approxQuantile(col_name, [0.5], 0.01)
  return q[0] if q else None

median_from = compute_median(df, "SALARY_FROM")
median_to = compute_median(df, "SALARY_TO")
median_salary = compute_median(df, "SALARY")

print("Medians:", median_from, median_to, median_salary)

df = df.fillna({
      "SALARY_FROM": median_from,
      "SALARY_TO": median_to,
      "SALARY": median_salary
})

df= df.withColumn("Average_Salary", (col("SALARY_FROM") + col("SALARY_TO")) / 2)


export_cols = [
    "EDUCATION_LEVELS_NAME",
    "REMOTE_TYPE_NAME",
    "MAX_YEARS_EXPERIENCE",
    "Average_Salary",
    "SALARY",
    "LOT_V6_SPECIALIZED_OCCUPATION_NAME"
] 

df_selected = df.select(*export_cols)

pdf = df_selected.toPandas()
pdf.to_csv("./data/lightcast_cleaned.csv", index=False)


print("Data Cleaning Complete. Rows retained:", len(pdf))
[Stage 2:>                                                          (0 + 1) / 1]                                                                                [Stage 3:>                                                          (0 + 1) / 1]                                                                                [Stage 4:>                                                          (0 + 1) / 1]                                                                                [Stage 5:>                                                          (0 + 1) / 1]                                                                                
Medians: 87295.0 130042.0 115024.0
Data Cleaning Complete. Rows retained: 72498

2 2. Salary Distribution by Industry and Employment Type

3 Salary Distribution by Employment Type

import pandas as pd


pdf = df.filter(df["SALARY"] > 0).select("EMPLOYMENT_TYPE_NAME", "SALARY").toPandas()
import re
pdf["EMPLOYMENT_TYPE_NAME"] = pdf["EMPLOYMENT_TYPE_NAME"].apply(
    lambda x: re.sub(r"[^\x00-\x7F]+", "", str(x)).strip() if pd.notnull(x) else ""
)

pdf = pdf[pdf["EMPLOYMENT_TYPE_NAME"] != ""]

median_salaries = pdf.groupby("EMPLOYMENT_TYPE_NAME")["SALARY"].median()
median_salaries.head()

sorted_employment_types = median_salaries.sort_values(ascending=False).index

pdf["EMPLOYMENT_TYPE_NAME"] = pd.Categorical(
    pdf["EMPLOYMENT_TYPE_NAME"],
    categories=sorted_employment_types,
    ordered=True
)
fig = px.box(
    pdf,
    x="EMPLOYMENT_TYPE_NAME",
    y="SALARY",
    title="Salary Distribution by Employment Type",
    color_discrete_sequence=["#1f77b4"],  
    boxmode="group",
    points="all",
)

fig.update_layout(
    title=dict(
        text="Salary Distribution by Employment Type",
        font=dict(size=16, family="Helvetica", color="black", weight = "bold")
    ),
    
    xaxis=dict(
        title=dict(text="Employment Type", font=dict(size=14, family = "Helvetica", color = "black", weight = "bold")),
        tickangle=0, 
        tickfont=dict(size=12, family = "Arial", color = "black", weight = "bold"),
        showline = True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        showgrid=False,
        categoryorder="array",
        categoryarray=sorted_employment_types.tolist()
    ),
    yaxis=dict(
    title=dict(text="Salary (K $)", font=dict(size=14, family="Helvetica", color="black", weight="bold")),
    tickvals=[0, 50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 450000, 500000],
    ticktext=["0", "50K", "100K", "150K", "200K", "250K", "300K", "350K", "400K", "450K", "500K"],
    tickfont=dict(size=12, family="Helvetica", color="black", weight="bold"),
    showline=True,
    linewidth=2,
    linecolor="black",
    mirror=True,
    showgrid=False,
    gridcolor="lightgray",
    gridwidth=0.5,
    ),
    font=dict(family="Helvetica", size = 12, color = "black"),
    boxgap=0.7,
    plot_bgcolor="white",
    paper_bgcolor="white",
    showlegend=False,
    height=500,
    width=800,


)



fig.show()
fig.write_html("output/Q1.html")
fig.write_image("output/Q1.svg", width=850, height=500, scale=1)
[Stage 6:>                                                          (0 + 1) / 1]                                                                                

4 Salary Distribution by Industry

import pandas as pd
import re
import plotly.express as px

pdf = df.filter(df["SALARY"] > 0).select("NAICS2_NAME", "SALARY").toPandas()

pdf["NAICS2_NAME"] = pdf["NAICS2_NAME"].apply(
    lambda x: re.sub(r"[^\x00-\x7F]+", "", str(x)).strip() if pd.notnull(x) else ""
)

pdf = pdf[pdf["NAICS2_NAME"] != ""]

median_salaries = pdf.groupby("NAICS2_NAME")["SALARY"].median()
sorted_industries = median_salaries.sort_values(ascending=False).index

pdf["NAICS2_NAME"] = pd.Categorical(
    pdf["NAICS2_NAME"],
    categories=sorted_industries,
    ordered=True
)

fig = px.box(
    pdf,
    x="NAICS2_NAME",
    y="SALARY",
    title="Salary Distribution by Industry",
    color_discrete_sequence=["#EF553B"],  # bright red for boxes
    boxmode="group",
    points="all",
)

fig.update_layout(
    title=dict(
        text="Salary Distribution by Industry",
        font=dict(size=30, family="Arial", color="#990000", weight="bold")  # dark red title
    ),
    xaxis=dict(
        title=dict(text="Industry", font=dict(size=14, family="Arial", color="#B22222", weight="bold")),  # firebrick red
        tickangle=45,
        tickfont=dict(size=12, family="Arial", color="#B22222", weight="bold"),
        showline=True,
        linewidth=2,
        linecolor="#B22222",
        mirror=True,
        showgrid=False,
        categoryorder="array",
        categoryarray=sorted_industries.tolist()
    ),
    yaxis=dict(
        title=dict(text="Salary (K $)", font=dict(size=14, family="Arial", color="#800000", weight="bold")),  # maroon
        tickvals=[100000, 200000, 300000, 400000, 500000],
        ticktext=["100K", "200K", "300K", "400K", "500K"],
        tickfont=dict(size=12, family="Arial", color="#800000", weight="bold"),
        showline=True,
        linewidth=2,
        linecolor="#800000",
        mirror=True,
        showgrid=False,
        gridcolor="#F5B7B1",
        gridwidth=0.5,
    ),
    font=dict(family="Arial", size=12, color="#800000"),
    boxgap=0.7,
    plot_bgcolor="#FFF0F0",
    paper_bgcolor="#FFF5F5",
    showlegend=False,
    height=900,
    width=1100,
)

fig.show()
fig.write_html("output/Q2.html")
fig.write_image("output/Q2.svg", width=1100, height=900, scale=1)
[Stage 7:>                                                          (0 + 1) / 1]                                                                                

5 Salary Analysis by ONET Occupation Type (Bubble Chart)

df.createOrReplaceTempView("Job_Postings")
25/09/24 21:08:21 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
salary_analysis = spark.sql("""
    Select
        TITLE_NAME AS ONET_NAME,
        PERCENTILE(SALARY, 0.5) AS Median_Salary,
        COUNT(*) AS Job_Postings
    FROM job_Postings
    GROUP BY TITLE_NAME
    ORDER BY Job_Postings DESC
    LIMIT 10
""")

salary_pd = salary_analysis.toPandas()
salary_pd.head()

import plotly.express as px

fig = px.scatter(
    salary_pd,
    x = "ONET_NAME",
    y="Median_Salary",
    size="Job_Postings",
    title="Median Salary by ONET Occupation Type (Bubble Chart)",
    labels = {"ONET_NAME": "ONET Occupation", "Median_Salary": "Median Salary", "Job_Postings": "NUmber of Job Postings"
    },
    hover_name = "ONET_NAME",
    size_max = 60,
    width=1000,
    height=600,
    color="Job_Postings",
    color_continuous_scale="Plasma"
)
fig.update_layout(
  font_family="Arial",
  font_size = 14,
  title_font_size=25,
  xaxis_title = "ONET Occupation",
  yaxis_title= "Median Salary",
  plot_bgcolor="white",
  xaxis=dict(
    tickangle=-45,
    showline=True,
    linecolor="black"
  ),
  yaxis=dict(
    showline=True,
    linecolor="black"
  )
)


fig.show()
fig.write_html("output/Q3.html")
fig.write_image("output/Q3.svg", width=1100, height=900, scale=1)
[Stage 8:>                                                          (0 + 1) / 1]                                                                                

6 Salary by Education Level

lower_deg = ["Bachelor's", "Associate's", "GED", "No Education Listed", "High School"]
higher_deg = ["Master's Degree", "PHD or professional degree"]

df = df.withColumn(
  "EDU_GROUP",
  when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in lower_deg])), "Bachelor's or lower")
  .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in higher_deg])), "Master's or PHD")
  .otherwise("Other")
)
df= df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))
df= df.withColumn("AVERAGE_SALARY", col("AVERAGE_SALARY").cast("float"))


df = df.filter(
  col("MAX_YEARS_EXPERIENCE").isNotNull() &
  col("AVERAGE_SALARY").isNotNull() &
  (col("MAX_YEARS_EXPERIENCE") > 0) &
  (col("AVERAGE_SALARY") > 0) 


)

df_filtered = df.filter(col("EDU_GROUP").isin("Bachelor's or lower", "Master's or PHD"))

df_pd = df_filtered.toPandas()

import numpy as np


jitter_amount = 0.15  
df_pd['MAX_YEARS_EXPERIENCE_JITTER'] = (
    df_pd['MAX_YEARS_EXPERIENCE'] +
    np.random.uniform(-jitter_amount, jitter_amount, len(df_pd))
)
[Stage 11:>                                                         (0 + 1) / 1]                                                                                

7 Scatter Plot 1

df_bachelor = df_pd[df_pd["EDU_GROUP"]=="Bachelor's or lower"]
fig1 = px.scatter(
  df_bachelor,
  x="MAX_YEARS_EXPERIENCE_JITTER",
  y="AVERAGE_SALARY",
  color = "EDU_GROUP",
  hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
  title="<b>Experience vs Salary by Education Level</b>",
  opacity=0.7,
  color_discrete_sequence=["#636efa"]

)

fig1.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))

fig1.update_layout(
  plot_bgcolor="#f9f9f9",
  paper_bgcolor="#FFF5DC",
  font=dict(family="Segoe UI", size=14),
  title_font=dict(size=22),
  xaxis_title="Years of Experience",
  yaxis_title="Average Salary (USD)",
  legend_title="Education Group",
  hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),
  margin=dict(t=70,b=60, l=60, r=60),
  xaxis=dict(
    gridcolor="lightgrey",
    tickmode='linear',
    dtick=1
  ),
  yaxis=dict(gridcolor="lightgrey")
)
fig1.show()
fig1.write_html("outputq_1a_Experience_vs_Salary_by_Education_Level_Bachelors.html")
fig1.write_image("output/q_1a_Experience_vs_Salary_by_Education_Level_Bachelors.svg", width=1100, height=900, scale=1)

8 Scatter Plot 2

df_master = df_pd[df_pd["EDU_GROUP"]=="Master's or PHD"]
fig2 = px.scatter(
  df_master,
  x="MAX_YEARS_EXPERIENCE_JITTER",
  y="AVERAGE_SALARY",
  color = "EDU_GROUP",
  hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
  title="<b>Experience vs Salary by Education Level</b>",
  opacity=0.7,
  color_discrete_sequence=["#FFC0CB"]

)

fig2.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))

fig2.update_layout(
  plot_bgcolor="#f9f9f9",
  paper_bgcolor="#FFF5DC",
  font=dict(family="Segoe UI", size=14),
  title_font=dict(size=22),
  xaxis_title="Years of Experience",
  yaxis_title="Average Salary (USD)",
  legend_title="Education Group",
  hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),
  margin=dict(t=70,b=60, l=60, r=60),
  xaxis=dict(
    gridcolor="lightgrey",
    tickmode='linear',
    dtick=1
  ),
  yaxis=dict(gridcolor="lightgrey")
)
fig2.show()
fig2.write_html("outputq_1a_Experience_vs_Salary_by_Education_Level_Masters.html")
fig2.write_image("output/q_1a_Experience_vs_Salary_by_Education_Level_Masters.svg", width=1100, height=900, scale=1)

9 Histogram 1

import seaborn as sns
import matplotlib.pyplot as plt

sns.histplot(data=df_pd[df_pd['EDU_GROUP'] == "Bachelor's or lower"],
            x='AVERAGE_SALARY',
            bins=30,
            kde=True,
            color='#FFFF00',
            stat='density',
            alpha=0.5)

plt.title("Bachelor's or Lower Histogram")
plt.xlabel('Average Salary (USD)')
plt.ylabel('Density')
plt.show()
plt.savefig("output/Bachelors_or_Lower_Histogram.svg", bbox_inches='tight')

<Figure size 672x480 with 0 Axes>

10 Histogram 2

import seaborn as sns
import matplotlib.pyplot as plt

sns.histplot(data=df_pd[df_pd['EDU_GROUP'] == "Master's or PHD"],
            x='AVERAGE_SALARY',
            bins=30,
            kde=True,
            color='#636efa',
            stat='density',
            alpha=0.5)

plt.title("Master's or PHD Histogram")
plt.xlabel('Average Salary (USD)')
plt.ylabel('Density')
plt.show()
plt.savefig("output/Masters_or_PHD_Histogram.svg", bbox_inches='tight')

<Figure size 672x480 with 0 Axes>

11 Salary by Remote Work Type

from pyspark.sql.functions import when, col, trim

df = df.withColumn("REMOTE_GROUP",
    when(trim(col("REMOTE_TYPE_NAME")) == "Remote", "Remote")
    .when(trim(col("REMOTE_TYPE_NAME")) == "Hybrid Remote", "Hybrid")
    .when(trim(col("REMOTE_TYPE_NAME")) == "Not Remote", "Onsite")
    .when(col("REMOTE_TYPE_NAME").isNull(), "Onsite")
    .otherwise("Onsite")
)

df = df.filter(
  col("MAX_YEARS_EXPERIENCE").isNotNull() &
  col("AVERAGE_SALARY").isNotNull() &
  (col("MAX_YEARS_EXPERIENCE") > 0) &
  (col("AVERAGE_SALARY") > 0) 
)

df_pd = df.select(
    "MAX_YEARS_EXPERIENCE", "Average_Salary",
    "LOT_V6_SPECIALIZED_OCCUPATION_NAME", "REMOTE_GROUP"
).toPandas()

import numpy as np


jitter_amount = 0.15  
df_pd['MAX_YEARS_EXPERIENCE_JITTER'] = (
    df_pd['MAX_YEARS_EXPERIENCE'] +
    np.random.uniform(-jitter_amount, jitter_amount, len(df_pd))
)
[Stage 12:>                                                         (0 + 1) / 1]                                                                                

12 Scatter Plot 1

remote_df = df_pd[df_pd['REMOTE_GROUP'] == 'Remote']
fig1 = px.scatter(
  remote_df,
  x="MAX_YEARS_EXPERIENCE_JITTER",
  y="Average_Salary",
  color = "REMOTE_GROUP",
  hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
  title="<b>Experience vs Salary by Remote Work Type</b>",
  opacity=0.7,
  color_discrete_sequence=["#636efa"]

)

fig1.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))

fig1.update_layout(
  plot_bgcolor="#f9f9f9",
  paper_bgcolor="#FFF5DC",
  font=dict(family="Segoe UI", size=14),
  title_font=dict(size=22),
  xaxis_title="Years of Experience",
  yaxis_title="Average Salary (USD)",
  legend_title="Remote Work Type",
  hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),
  margin=dict(t=70,b=60, l=60, r=60),
  xaxis=dict(
    gridcolor="lightgrey",
    tickmode='linear',
    dtick=1
  ),
  yaxis=dict(gridcolor="lightgrey")
)
fig1.show()
fig1.write_html("output/Experience_vs_Salary_by_Remote_Work_Type.html")
fig1.write_image("output/Experience_vs_Salary_by_Remote_Work_Type.svg", width=1100, height=900, scale=1)

13 Scatter Plot 2

hybrid_df = df_pd[df_pd['REMOTE_GROUP'] == 'Hybrid']
fig2 = px.scatter(
  hybrid_df,
  x="MAX_YEARS_EXPERIENCE_JITTER",
  y="Average_Salary",
  color = "REMOTE_GROUP",
  hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
  title="<b>Experience vs Salary by Remote Work Type</b>",
  opacity=0.7,
  color_discrete_sequence=["#00CC96"]

)

fig2.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))

fig2.update_layout(
  plot_bgcolor="#f9f9f9",
  paper_bgcolor="#FFF5DC",
  font=dict(family="Segoe UI", size=14),
  title_font=dict(size=22),
  xaxis_title="Years of Experience",
  yaxis_title="Average Salary (USD)",
  legend_title="Remote Work Type",
  hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),
  margin=dict(t=70,b=60, l=60, r=60),
  xaxis=dict(
    gridcolor="lightgrey",
    tickmode='linear',
    dtick=1
  ),
  yaxis=dict(gridcolor="lightgrey")
)
fig2.show()
fig2.write_html("output/Experience_vs_Salary_by_Remote_Work_Type_Hybrid.html")
fig2.write_image("output/Experience_vs_Salary_by_Remote_Work_Type_Hybrid.svg", width=1100, height=900, scale=1)

14 Scatter Plot 3

onsite_df = df_pd[df_pd['REMOTE_GROUP'] == 'Onsite']
fig3 = px.scatter(
  onsite_df,
  x="MAX_YEARS_EXPERIENCE_JITTER",
  y="Average_Salary",
  color = "REMOTE_GROUP",
  hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
  title="<b>Experience vs Salary by Remote Work Type</b>",
  opacity=0.7,
  color_discrete_sequence=["#FFA500"]

)

fig3.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))

fig3.update_layout(
  plot_bgcolor="#f9f9f9",
  paper_bgcolor="#FFF5DC",
  font=dict(family="Segoe UI", size=14),
  title_font=dict(size=22),
  xaxis_title="Years of Experience",
  yaxis_title="Average Salary (USD)",
  legend_title="Remote Work Type",
  hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),
  margin=dict(t=70,b=60, l=60, r=60),
  xaxis=dict(
    gridcolor="lightgrey",
    tickmode='linear',
    dtick=1
  ),
  yaxis=dict(gridcolor="lightgrey")
)
fig3.show()
fig3.write_html("output/Experience_vs_Salary_by_Remote_Work_Type_Onsite.html")
fig3.write_image("output/Experience_vs_Salary_by_Remote_Work_Type_Onsite.svg", width=1100, height=900, scale=1)

15 Histogram 1

import matplotlib.pyplot as plt

remote_salaries = df_pd[df_pd['REMOTE_GROUP'] == 'Remote']['Average_Salary']

plt.figure(figsize=(6, 4))
plt.hist(remote_salaries, bins=15, color='#636efa', alpha=0.7, edgecolor='black')
plt.title(' Remote Salary Distribution')
plt.xlabel('Average Salary (USD)')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.7)
plt.show()
plt.savefig("output/Salary_Distribution_Remote.svg")

<Figure size 672x480 with 0 Axes>

16 Histogram 2

import matplotlib.pyplot as plt

remote_salaries = df_pd[df_pd['REMOTE_GROUP'] == 'Hybrid']['Average_Salary']

plt.figure(figsize=(6, 4))
plt.hist(remote_salaries, bins=15, color='#80b131ff', alpha=0.7, edgecolor='black')
plt.title(' Hybrid Salary Distribution')
plt.xlabel('Average Salary (USD)')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.7)
plt.show()
plt.savefig("output/Salary_Distribution_Hybrid.svg")

<Figure size 672x480 with 0 Axes>

17 Histogram 3

import matplotlib.pyplot as plt

remote_salaries = df_pd[df_pd['REMOTE_GROUP'] == 'Onsite']['Average_Salary']

plt.figure(figsize=(6, 4))
plt.hist(remote_salaries, bins=15, color='#e21b1eff', alpha=0.7, edgecolor='black')
plt.title(' Onsite Salary Distribution')
plt.xlabel('Average Salary (USD)')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.7)
plt.show()
plt.savefig("output/Salary_Distribution_Onsite.svg")

<Figure size 672x480 with 0 Axes>